knitr::opts_chunk$set(echo = TRUE)
Blah
library(tidyverse) library(igraph) library(tictoc)
members_df <- read_csv("./linkedin_data/common_connection_200k.csv") members_sample_df <- members_df %>% sample_n(1000000) # take a sample members_small_sample_df <- members_df %>% sample_n(100000) # take a sample members_sample_graph <- members_sample_df %>% graph_from_data_frame(directed = FALSE) members_small_sample_graph <- members_small_sample_df %>% graph_from_data_frame(directed = FALSE)
print("Number of unique vertices:") vertex_attr(members_sample_graph)$name %>% unique() %>% length()
Playing around with igraph we can use the graph to extract the original data frame:
edges_df <- get.edgelist(members_sample_graph) %>% as.tibble() %>% rename(member = V1, member_conn = V2) edges_df %>% arrange(member, member_conn)
Algorithm 1 (finds node with most ) in English:
1. Get Member.
2. Get Member's Connections.
3. For Each Connection, Get List of Connections Connections.
4. Check If Connections Connections Are In Member's Connections.
5. If Yes: Increment & Store Counter for Member vs Connections.
6. Repeat 1-5 Until All Membership Pairs Are Accounted For.
Number of unique member names vs total number of member names in sample:
edges_df$member %>% unique() %>% length() edges_df$member %>% length() rm(edges_df)
The following bit will print for each member the number of friends who's friends are also friends of the member:
first_member <- members_df %>% filter(member_id == "150503") for(conn in first_member$connected_member_id) { conn_conns <- members_df %>% filter(member_id == conn) # print(conn_conns) (conn_conns$connected_member_id %in% first_member$connected_member_id) %>% sum(na.rm = TRUE) %>% print() }
tictoc::tic("Total") tictoc::tic("Create Nested data.frame") nested_sample_df <- members_sample_df %>% nest(connected_member_id) %>% mutate(connected_member_ids = map(data, function(x) x$connected_member_id) ) %>% select(-data) tictoc::toc() first_member <- nested_sample_df %>% filter(member_id == "67890") tictoc::tic("Calculate Number of Connections") for(friends in first_member$connected_member_ids[[1]]){ sample_less_friends <- nested_sample_df %>% filter(!(member_id %in% union(first_member$member_id, friends))) sample_less_friends <- sample_less_friends %>% mutate(common_friends = map2_int(connected_member_ids, friends, function(x, y) {sum(x %in% y, na.rm = TRUE)} )) } tictoc::toc() tictoc::toc()
create_nested_member_dataframe <- function(member_data) { member_data %>% nest(connected_member_id) %>% mutate(connected_member_ids = map(data, function(x) x$connected_member_id) ) %>% select(-data) } nested_members_df <- create_nested_member_dataframe(members_df)
calculate_num_common_friends <- function(member_data, member_name){ first_member <- member_data %>% filter(member_id == member_name) for(friends in first_member$connected_member_ids[[1]]){ sample_less_friends <- member_data %>% filter(!(member_id %in% union(first_member$member_id, friends))) sample_less_friends <- sample_less_friends %>% mutate(common_friends = map2_int(connected_member_ids, friends, function(x, y) {sum(x %in% y, na.rm = TRUE)} )) } sample_less_friends } tictoc::tic("Bad Algo 2") calculate_num_common_friends(nested_members_df, "67890") tictoc::toc()
Attempting to find friends of friends for a large sample (about 1*10^6 edges) took well over 2 hours to compute using the igraph package to calculate the neighbourhoods. I suspect the computation time is worse than O(n) so it could take over a day to complete this calculation for the full dataset.
Worryingly, the setdiff
between degree 1 and 2 neighbourhoods is returning the empty set which seems very wrong. Given we have sampled about 1/10 of the full dataset it seems unlikely that we would not find any friends of friends!
tictoc::tic("Total") tictoc::tic("1st Degree Neighbourhood") neighbours_1_deg <- unlist(neighborhood(members_sample_graph, 1)) tictoc::toc() tictoc::tic("2nd Degree Neighbourgood") neighbours_2_deg <- unlist(neighborhood(members_sample_graph, 2)) tictoc::toc() tictoc::tic("Friends of Friends") second_deg_conns <- setdiff(neighbours_2_deg, neighbours_1_deg) tictoc::toc() tictoc::toc()
tic("Create friend of friend list") f_of_f_small_sample <- ego(members_small_sample_graph, order = 2, mindist = 2, mode = "all") toc() # tic("Create friend of friend list") # f_of_f_sample <- ego(members_sample_graph, order = 2, mindist = 2, mode = "all") # toc()
tic("Create fof dataframe") tic("Create nested df") f_of_f_small_sample_df <- tibble(member_id = V(members_small_sample_graph)$name, friend_of_friend = map(f_of_f_small_sample, function(x) x$name)) toc() tic("Unnest df") f_of_f_small_sample_df <- unnest(f_of_f_small_sample_df, friend_of_friend) toc() # Direction does not matter here. tic("Remove duplicate relationships") f_of_f_small_sample_df <- f_of_f_small_sample_df %>% mutate(sorted_set = map2_chr(member_id, friend_of_friend, function(x, y) paste(sort(c(x, y)), collapse = "") ) ) %>% distinct(sorted_set, .keep_all = TRUE) %>% select(-sorted_set) %>% mutate_all(as.integer) toc() toc()
Now we have a unique row for each friend of a friend connection. We can use this to look up the original member's direct connections and the friend of friend's direct connections, and count the amount of matches. Key to this will be speed of lookup of the direct connections.
I should be using microbenchmark
or similar package here, but tictoc
should give us a good feel for the scale of the differences between lookup methods.
tic("Members lookup dplyr") members_df %>% filter(member_id == f_of_f_small_sample_df[10000, ]$member_id) toc() tic("Members lookup base") members_df[members_df$member_id == f_of_f_small_sample_df[10000, ]$member_id, ] toc()
This kind of time could be a problem for a large number of calculations. Let's try using the indexing feature from data.table
.
tic("Use data.table") tic("Members convert to data.table") members_dt <- data.table::as.data.table(members_df) toc() tic("Add index") data.table::setkey(members_dt, member_id) toc() tic("Simple lookup") members_dt[member_id == f_of_f_small_sample_df[10000, ]$member_id] toc() toc()
It seems we might be able to reduce the lookup time by roughly a factor of 2 over base or dplyr, a definitive answer would require better use of benchmarking tools to get a sample of lookups.
For now we have settled on using an indexed version of the members data frame, using data.table. We can use this method to efficiently lookup the friends from each member and from the friends of friends of member.
tic("Count common connections per relationship") sum(members_dt[member_id == f_of_f_small_sample_df[10000, ]$member_id]$connected_member_id %in% members_dt[member_id == f_of_f_small_sample_df[10000, ]$friend_of_friend]$connected_member_id, na.rm = TRUE) toc()
So for one set of lookups and calculation of common connections on the full indexed data.table it took about 0.04 seconds. This means the lookups for the small sample (100,000 members), we're looking at about 3 hours of calcuations. 2 lookups on an indexed table should scale as roughly O(log(n)), but this leaves us in trouble as we had 260k friend of a friend connections from 100k sample of members. The solution time could baloon in time and possibly memory when we bring in larger samples.
paste((0.04 / 3600) * 260337, "hours")
f_of_f_small_sample <- ego(members_small_sample_graph, order = 2, mindist = 2, mode = "all") get_friends_of_friends <- function(igraph_connections, friends_of_friends_vertices) { # takes an igraph object of all connections and a list of vertices of all friends of friends # calculated by igraph::ego(igraph_obj, order = 2, mindist = 2, mode = "all") friends_of_friends_df <- tibble(member_id = V(igraph_connections)$name, friend_of_friend = map(friends_of_friends_vertices, function(x) x$name)) friends_of_friends_df <- unnest(friends_of_friends_df, friend_of_friend) friends_of_friends_df %>% mutate(sorted_set = map2_chr(member_id, friend_of_friend, function(x, y) paste(sort(c(x, y)), collapse = ""))) %>% distinct(sorted_set, .keep_all = TRUE) %>% select(-sorted_set) %>% mutate_all(as.integer) } get_friends_of_friends(members_small_sample_graph, f_of_f_small_sample) tic("Sample full calculation") f_of_f_small_sample_df <- f_of_f_small_sample_df %>% mutate(count_common_connections = map2_int(member_id, friend_of_friend, calc_num_common_conns, members_dt)) toc() calc_num_common_conns <- function(member, f_of_f, member_conns) { sum(member_conns[member_id == member]$connected_member_id %in% member_conns[member_id == f_of_f]$connected_member_id, na.rm = TRUE) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.